## Introduction

The goal of this project is to predict the manner in which the participants performed the exercise. This is the “classe” variable in the training set.

## Data Sets

The datasets are available at the links below:

For training: https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv

For testing: https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv

I am going to use the following R packages for the EDA, data cleaning and model building:

 require(knitr)
## Loading required package: knitr
 require(caret)
## Loading required package: caret
## Warning: package 'caret' was built under R version 4.0.4
## Loading required package: lattice
## Loading required package: ggplot2
 require(rpart)
## Loading required package: rpart
 require(rpart.plot)
## Loading required package: rpart.plot
## Warning: package 'rpart.plot' was built under R version 4.0.4
 require(randomForest)
## Loading required package: randomForest
## Warning: package 'randomForest' was built under R version 4.0.5
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
 require(caTools)
## Loading required package: caTools
 require(Amelia)
## Loading required package: Amelia
## Warning: package 'Amelia' was built under R version 4.0.5
## Loading required package: Rcpp
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.6, built: 2019-11-24)
## ## Copyright (C) 2005-2021 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
 require(devtools)
## Loading required package: devtools
## Loading required package: usethis
 require(ggcorrplot)
## Loading required package: ggcorrplot
## Warning: package 'ggcorrplot' was built under R version 4.0.5
 require(plotly)
## Loading required package: plotly
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
 require(correlationfunnel)
## Loading required package: correlationfunnel
## Warning: package 'correlationfunnel' was built under R version 4.0.5
## == correlationfunnel Tip #3 ====================================================
## Using `binarize()` with data containing many columns or many rows can increase dimensionality substantially.
## Try subsetting your data column-wise or row-wise to avoid creating too many columns.
## You can always make a big problem smaller by sampling. :)

## Loading Data

# Load the training and test sets directly from the course URLs listed in
# the Data Sets section, so the analysis is reproducible. (The original
# used file.choose(), which requires an interactive session and a manual
# click every run.)
tr_url <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
ts_url <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"
tr_data <- read.csv(tr_url, stringsAsFactors = TRUE, header = TRUE)
ts_data <- read.csv(ts_url, stringsAsFactors = TRUE, header = TRUE)
missmap(tr_data) # visualise missing values in the training set

missmap(ts_data) # visualise missing values in the test set

dim(tr_data)
## [1] 19622   160
dim(ts_data)
## [1]  20 160

## Data Cleansing

After checking the datasets, I found that there are missing values in both datasets, and some variables have nearly zero variance. Now we need to remove the near-zero-variance variables, the mostly-NA variables, and the leading identification columns.

# Identify near-zero-variance predictors on the TRAINING set only, then
# drop the same columns from both sets so they stay aligned.
nzv <- nearZeroVar(tr_data)
tr_data <- tr_data[, -nzv]
ts_data <- ts_data[, -nzv]
dim(tr_data)
## [1] 19622   100
dim(ts_data)
## [1]  20 100
# Drop variables that are more than 95% NA. vapply() (not sapply()) pins
# the return type to one double per column.
AllNA <- vapply(tr_data, function(x) mean(is.na(x)), numeric(1)) > 0.95
tr_data <- tr_data[, !AllNA]
dim(tr_data)
## [1] 19622    59
ts_data <- ts_data[, !AllNA]
dim(ts_data)
## [1] 20 59
# Drop the first 7 columns (identification/timestamp-style variables, not
# sensor measurements). -(1:7) is equivalent to the original hard-coded
# 8:59 but does not silently break if the column count changes.
# NOTE(review): the printed colnames below start at pitch_belt — if the raw
# data contains a roll_belt column, it is being dropped here as the 7th
# column; confirm whether only the first 6 ID columns were intended.
tr_data <- tr_data[, -(1:7)]
ts_data <- ts_data[, -(1:7)]
dim(tr_data)
## [1] 19622    52
dim(ts_data)
## [1] 20 52
colnames(tr_data)
##  [1] "pitch_belt"           "yaw_belt"             "total_accel_belt"    
##  [4] "gyros_belt_x"         "gyros_belt_y"         "gyros_belt_z"        
##  [7] "accel_belt_x"         "accel_belt_y"         "accel_belt_z"        
## [10] "magnet_belt_x"        "magnet_belt_y"        "magnet_belt_z"       
## [13] "roll_arm"             "pitch_arm"            "yaw_arm"             
## [16] "total_accel_arm"      "gyros_arm_x"          "gyros_arm_y"         
## [19] "gyros_arm_z"          "accel_arm_x"          "accel_arm_y"         
## [22] "accel_arm_z"          "magnet_arm_x"         "magnet_arm_y"        
## [25] "magnet_arm_z"         "roll_dumbbell"        "pitch_dumbbell"      
## [28] "yaw_dumbbell"         "total_accel_dumbbell" "gyros_dumbbell_x"    
## [31] "gyros_dumbbell_y"     "gyros_dumbbell_z"     "accel_dumbbell_x"    
## [34] "accel_dumbbell_y"     "accel_dumbbell_z"     "magnet_dumbbell_x"   
## [37] "magnet_dumbbell_y"    "magnet_dumbbell_z"    "roll_forearm"        
## [40] "pitch_forearm"        "yaw_forearm"          "total_accel_forearm" 
## [43] "gyros_forearm_x"      "gyros_forearm_y"      "gyros_forearm_z"     
## [46] "accel_forearm_x"      "accel_forearm_y"      "accel_forearm_z"     
## [49] "magnet_forearm_x"     "magnet_forearm_y"     "magnet_forearm_z"    
## [52] "classe"

## EDA

To check the correlation among the predictors:

# Correlation matrix of the 51 numeric predictors (column 52 is the factor
# outcome `classe`, so it is excluded), rounded to 1 decimal for the plot.
corr <- round(cor(tr_data[,-52]),1)
# Lower-triangle heat map, with rows/columns ordered by hierarchical
# clustering (hc.order) so correlated groups sit together.
plot1 <- ggcorrplot(corr, hc.order = TRUE, type = "lower",
            outline.col = "white",
            ggtheme = ggplot2::theme_gray,
            colors = c("#6D9EC1", "white", "#E46726"))
ggplotly(plot1)  # interactive version of the heat map
# Correlation funnels: binarize() recodes the data into binary indicator
# columns (numeric columns split into n_bins = 4 bins; factor levels rarer
# than thresh_infreq = 0.01 lumped), then correlate() ranks every indicator
# against one level of the outcome at a time.
# NOTE(review): the five pipelines below differ only in the target level —
# consider factoring them into a single helper over c("A","B","C","D","E").
## Correlation with classe__A
tr_data %>% binarize(n_bins = 4,thresh_infreq = 0.01) %>%
         correlate(target = classe__A) %>%
    plot_correlation_funnel(interactive = T,limits =
                                    c(-0.5,0.5))
## Correlation with classe__B
tr_data %>% binarize(n_bins = 4,thresh_infreq = 0.01) %>%
         correlate(target = classe__B) %>%
    plot_correlation_funnel(interactive = T,limits =
                                    c(-0.5,0.5))
## Correlation with classe__C
tr_data %>% binarize(n_bins = 4,thresh_infreq = 0.01) %>%
         correlate(target = classe__C) %>%
    plot_correlation_funnel(interactive = T,limits =
                                    c(-0.5,0.5))
## Correlation with classe__D
tr_data %>% binarize(n_bins = 4,thresh_infreq = 0.01) %>%
         correlate(target = classe__D) %>%
    plot_correlation_funnel(interactive = T,limits =
                                    c(-0.5,0.5))
## Correlation with classe__E
tr_data %>% binarize(n_bins = 4,thresh_infreq = 0.01) %>%
         correlate(target = classe__E) %>%
    plot_correlation_funnel(interactive = T,limits =
                                    c(-0.5,0.5))

## Data Partitioning

For cross-validation — that is, to check the accuracy of our model — we need to split the training dataset further. For this I am going to use the caTools package.

# Hold out 25% of the training data for model validation.
set.seed(123)  # make the split reproducible, consistent with the model fits below
# BUG FIX: sample.split() expects the outcome VECTOR (it stratifies the
# split on its class levels). The original passed the whole data frame,
# which produces a logical of length ncol(tr_data) that is then recycled
# over the rows — an essentially arbitrary, non-stratified split.
sample <- sample.split(tr_data$classe, SplitRatio = 0.75)
train_dt <- tr_data[sample, ]
test_dt  <- tr_data[!sample, ]

## Decision Tree Model

# Baseline model: a single CART classification tree, fit on the training
# split and evaluated against the hold-out split.
set.seed(123)  # reproducible tree growth
fit <- rpart(formula = classe ~ ., data = train_dt, method = "class")
pred <- predict(object = fit, newdata = test_dt, type = "class")
confusionMatrix(data = pred, reference = test_dt$classe)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1217  136    7   31   34
##          B   48  516   39   62  120
##          C   29   77  622  111  130
##          D   77  171  130  559  114
##          E   23   50   57   42  503
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6966          
##                  95% CI : (0.6836, 0.7095)
##     No Information Rate : 0.2842          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6168          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.8730   0.5432   0.7275   0.6944   0.5583
## Specificity            0.9408   0.9320   0.9143   0.8800   0.9570
## Pos Pred Value         0.8540   0.6573   0.6419   0.5319   0.7452
## Neg Pred Value         0.9491   0.8947   0.9408   0.9362   0.9059
## Prevalence             0.2842   0.1937   0.1743   0.1641   0.1837
## Detection Rate         0.2481   0.1052   0.1268   0.1140   0.1025
## Detection Prevalence   0.2905   0.1600   0.1976   0.2143   0.1376
## Balanced Accuracy      0.9069   0.7376   0.8209   0.7872   0.7577

As we can see, the accuracy level of the decision tree is about 70%, which is not up to the desired level. So we need to try other models and compare their accuracy with this one.

## Random Forest Model

set.seed(123)  # reproducible forest
# BUG FIX: the original spelled the argument "mtyr", which randomForest()
# silently swallowed via `...`, so the forest actually ran with the default
# mtry. Spell it correctly so 7 candidate variables are tried at each split.
rf_fit <- randomForest(classe ~ ., data = train_dt, mtry = 7)
rf_pred <- predict(rf_fit, test_dt)
cnf_rf <- confusionMatrix(rf_pred, test_dt$classe)
# Mosaic plot of the confusion matrix. The original also passed
# `col = cnf_rf$byClass`, which mosaicplot() disregards (it emitted a
# warning in the knitted run), so only `color` is kept.
plot(cnf_rf$table, color = "blue",
     main = paste("Random Forest-Accuracy=",
                  round(cnf_rf$overall['Accuracy'], 4)))

After checking the overall statistics, the random-forest model clearly has higher accuracy than the decision-tree model. Hence we will select the random-forest model for the final prediction.

# Apply the selected random-forest model to the 20-row held-out test set
# and display the predicted classe for each case.
rf_ts_pred <- predict(object = rf_fit, newdata = ts_data)
print(rf_ts_pred)
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 
##  B  A  B  A  A  E  D  B  A  A  B  C  B  A  E  E  A  B  B  B 
## Levels: A B C D E